In [ ]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
from impala.dbapi import connect
from impala.util import as_pandas

In [ ]:
# connect to impala
conn = connect(host="mycluster.domain.com", port=my_impala_port_number)
# Make sure we"re pulling from the right location
cur = conn.cursor()
cur.execute("use my_db")

In [ ]:
# Need to select time range:
# Try Aug 1-2, 2016.
# Aug 1 - starttimebox = 1470009600000
# Aug 2 - endtimebox = 1470096000000
# Aug 5 = 1470355200000

Compare if patients with RRTs have similar average vital signs over their visit, compared to non-RRT patients. By this, we're asking if patients with RRTs are sicker than their non-RRT peers

Plot: scatter plot of O2 & Respiratory Rate (two most common vital signs) ... plot mean of values available for each patient (so that way, each O2 value should have a RR partner).

non-RRT patients in 1 color; RRT patients in another

Hypothesis: if you only grab & avg all the observed values, you'll not see a significant difference between the two populations... may want to have more than 1 days worth of data, to capture more events.


In [ ]:
# Following query returns averages of SpO2 & RR values for each encounterid.
# Needs to be reshaped into columnar format.
query_RRT = """
SELECT 
      ce.encntr_id
    , ce.event_cd 
    , cv_event_cd.description AS event_description 
    , AVG(CAST(ce.result_val as int)) as avg 
FROM clinical_event ce  
LEFT OUTER JOIN code_value cv_event_cd 
ON ce.event_cd = cv_event_cd.code_value 
WHERE ce.encntr_id IN ( 
                       SELECT DISTINCT encntr_id 
                       FROM clinical_event 
                       WHERE event_cd = '54411998' 
                       AND result_status_cd NOT IN ('31', '36') 
                       AND valid_until_dt_tm > unix_timestamp() 
                       AND event_class_cd not in ('654645') 
                      ) 
AND ce.event_cd IN ('3623994', '703540')
AND ce.performed_dt_tm > 1470009600000 
AND ce.performed_dt_tm < 1470096000000 
GROUP BY ce.encntr_id, ce.event_cd, cv_event_cd.description 
ORDER BY ce.encntr_id;
"""

In [ ]:
cur.execute(query_RRT)
df_RRT = as_pandas(cur)

In [ ]:
df_RRT.head()

In [ ]:
df_RRT.dtypes

In [ ]:
# O2 (oxygen)
df_O2RRT = df_RRT[df_RRT.event_cd=='3623994']
# Respiration Rate
df_RRRRT = df_RRT[df_RRT.event_cd=='703540']

In [ ]:
df_O2RRT.head()

In [ ]:
df_RRRRT.head()

In [ ]:
df_finRRT = pd.merge(df_O2RRT, df_RRRRT, on='encntr_id')

In [ ]:
df_finRRT.head()

In [ ]:
# Following query returns averages of SpO2 & RR values for each encounterid.
# Needs to be reshaped into columnar format.
query_NotRRT = """
SELECT 
    ce.encntr_id 
    , ce.event_cd
    , cv_event_cd.description AS event_description 
    , AVG(CAST(ce.result_val as int)) as avg 
FROM clinical_event ce  
LEFT OUTER JOIN code_value cv_event_cd 
ON ce.event_cd = cv_event_cd.code_value 
WHERE ce.encntr_id NOT IN ( 
                           SELECT DISTINCT encntr_id 
                           FROM clinical_event 
                           WHERE event_cd = '54411998' 
                           AND result_status_cd NOT IN ('31', '36') 
                           AND valid_until_dt_tm > unix_timestamp() 
                           AND event_class_cd not in ('654645') 
                          ) 
AND ce.event_cd IN ('3623994', '703540')
AND ce.performed_dt_tm > 1470009600000 
AND ce.performed_dt_tm < 1470096000000 
GROUP BY ce.encntr_id, ce.event_cd, cv_event_cd.description 
ORDER BY ce.encntr_id;
"""

In [ ]:
cur.execute(query_NotRRT)
df_NotRRT = as_pandas(cur)

In [ ]:
df_NotRRT.shape

In [ ]:
df_O2NotRRT = df_NotRRT[df_NotRRT.event_cd=='3623994']
df_RRNotRRT = df_NotRRT[df_NotRRT.event_cd=='703540']

In [ ]:
df_finNotRRT = pd.merge(df_O2NotRRT, df_RRNotRRT, on='encntr_id')

In [ ]:
df_finNotRRT.head()

Let's do some pair-wise plots


In [ ]:
plt.figure(figsize=(10, 8))
plt.scatter(df_finNotRRT.avg_x, df_finNotRRT.avg_y, s = 30, alpha=0.5, c='blue')
plt.scatter(df_finRRT.avg_x, df_finRRT.avg_y, s = 30, alpha=0.5, c='red')

In [ ]:
# A helper that takes two event codes and makes a pair plot will be useful

def scatter_avgs(event_cd1, event_cd2):
    ''' 
    Input: event codes are strings. 
    This function returns a figure, of the average of the vital signs per encounter
    '''
    
    query_RRT = """
                SELECT 
                      ce.encntr_id 
                    , ce.event_cd 
                    , cv_event_cd.description AS event_description 
                    , AVG(CAST(ce.result_val as int)) as avg 
                FROM clinical_event ce  
                JOIN encounter enc ON ce.encntr_id = enc.encntr_id 
                LEFT OUTER JOIN code_value cv_event_cd 
                ON ce.event_cd = cv_event_cd.code_value 
                WHERE ce.encntr_id IN ( 
                                       SELECT DISTINCT encntr_id 
                                       FROM clinical_event 
                                       WHERE event_cd = '54411998' 
                                       AND result_status_cd NOT IN ('31', '36') 
                                       AND valid_until_dt_tm > unix_timestamp() 
                                       AND event_class_cd not in ('654645') 
                                      ) 
                AND ce.event_cd IN ('{0}', '{1}')
                AND ce.performed_dt_tm > 1470009600000 
                AND ce.performed_dt_tm < 1470355200000 
                AND enc.loc_facility_cd = '633867'
                GROUP BY ce.encntr_id, ce.event_cd, cv_event_cd.description 
                ORDER BY ce.encntr_id;
                """.format(str(event_cd1), str(event_cd2))
    
    cur.execute(query_RRT)
    df_RRT = as_pandas(cur)
    df_1RRT = df_RRT[df_RRT.event_cd==str(event_cd1)]
    df_2RRT = df_RRT[df_RRT.event_cd==str(event_cd2)]
    df_finRRT = pd.merge(df_1RRT, df_2RRT, on='encntr_id')
    
    query_NotRRT = """
                SELECT 
                      ce.encntr_id 
                    , ce.event_cd 
                    , cv_event_cd.description AS event_description 
                    , AVG(CAST(ce.result_val as int)) as avg 
                FROM clinical_event ce  
                JOIN encounter enc ON ce.encntr_id = enc.encntr_id 
                LEFT OUTER JOIN code_value cv_event_cd 
                ON ce.event_cd = cv_event_cd.code_value 
                WHERE ce.encntr_id NOT IN ( 
                                           SELECT DISTINCT encntr_id 
                                           FROM clinical_event 
                                           WHERE event_cd = '54411998' 
                                           AND result_status_cd NOT IN ('31', '36') 
                                           AND valid_until_dt_tm > unix_timestamp() 
                                           AND event_class_cd not in ('654645') 
                                           ) 
                AND ce.event_cd IN ('{0}', '{1}')
                AND ce.performed_dt_tm > 1470009600000 
                AND ce.performed_dt_tm < 1470355200000 
                AND enc.loc_facility_cd = '633867'
                GROUP BY ce.encntr_id, ce.event_cd, cv_event_cd.description 
                ORDER BY ce.encntr_id;
                """.format(str(event_cd1), str(event_cd2))
    
    cur.execute(query_NotRRT)
    df_NotRRT = as_pandas(cur)
    df_1NotRRT = df_NotRRT[df_NotRRT.event_cd==str(event_cd1)]
    df_2NotRRT = df_NotRRT[df_NotRRT.event_cd==str(event_cd2)]
    df_finNotRRT = pd.merge(df_1NotRRT, df_2NotRRT, on='encntr_id')
    
    # plotting
    plt.figure(figsize=(10, 8))
    plt.scatter(df_finNotRRT.avg_x, df_finNotRRT.avg_y, s = 40, alpha=0.5, c='blue')
    plt.scatter(df_finRRT.avg_x, df_finRRT.avg_y, s = 30, alpha=0.5, c='red')
    plt.xlabel(event_cd1)
    plt.ylabel(event_cd2)
    plt.legend(['Non-RRT patients', 'RRT patients'])
    plt.title('Vital sign averages per encounter')

In [ ]:
scatter_avgs('703501','703516') # x = systolic BP; y = diastolic BP

In [ ]:
scatter_avgs('703558','703306') # x= oral temp, y = mean arterial pressure

In [ ]:
scatter_avgs('703565', '4686698') # glasgow coma score, RASS score

In [ ]:
scatter_avgs('4674677', '2700653') # weight, height

In [ ]:
scatter_avgs('703511', '703540') # pulse rate, respiratory rate
plt.xlabel('pulse rate', fontsize = 16)
plt.ylabel('respiratory rate', fontsize=16)

In [ ]:
scatter_avgs('703540', '4690633') # respiratory rate, CO2
plt.xlabel('Respiratory Rate', fontsize=16)
plt.ylabel('')

In [ ]:
scatter_avgs('703540', '2700541') # respiratory rate, heart rate
plt.xlabel('Respiratory Rate', fontsize=16)
plt.ylabel('Heart Rate', fontsize = 16)

In [ ]:
scatter_avgs('3623994', '4690633') # O2, CO2
plt.xlabel('O2', fontsize = 16)
plt.ylabel('CO2', fontsize = 16)
plt.xlim([86, 101])

In [ ]:
## Earlier assumption of averaging out the data --> correct, 
## no difference once we look at the avgerage vital signs over time.